#ifndef RIJNDAEL_GPU_CU
#define RIJNDAEL_GPU_CU

// code to implement Advanced Encryption Standard - Rijndael
#include "Rijndael_GPU.cuh"

#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include <fstream>
#include <iostream>
#include <string.h>
#include <math.h>

// includes, project
#include <cutil_inline.h>
#include <shrUtils.h>
#include <shrQATest.h>

// includes for cuda auto-complete suport
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include "cuPrintf.cu"

//The macro CUPRINTF is defined for architectures
//with different compute capabilities.
#if __CUDA_ARCH__ < 200 	//Compute capability 1.x architectures
#define CUPRINTF cuPrintf 
#else						//Compute capability 2.x architectures
#define CUPRINTF(fmt, ...) printf("[%d, %d]:\t" fmt, \
	blockIdx.y*gridDim.x+blockIdx.x,\
	threadIdx.z*blockDim.x*blockDim.y+threadIdx.y*blockDim.x+threadIdx.x,\
	__VA_ARGS__)
#endif


// includes, kernels
#include "Rijndael_GPU_kernel.cu"

/////////////////////////////////////////////////////////////////////////////////////
// internally data is stored in the state in order
//   0  1  2  3
//   4  5  6  7  
//   8  9 10 11
//   ...
// up to Nb of these
// NOTE: thus rows and columns are interchanged from the paper

using namespace std;

namespace GPU { // anonymous namespace

	// have the tables been initialized?
	bool tablesInitialized = false;

	// constants defining the algorithm
	int const gf2_8_poly = 0x11B; // the poly defining the 256 element field
	// poly defining mixing, coeffs usually '03010102'
	const unsigned long poly32 = 0x03010102; 
	// poly inverse, coeffs usually '0B0D090E'
	const unsigned long poly32_inv = 0x0B0D090E; 

	int const parameters[] = { // data in  Nr,C1,C2,C3 form
		//Nk*32 128         192         256
		10,1,2,3, 	12,1,2,3,  	14,1,2,3,   // Nb*32 = 128
		12,1,2,3, 	12,1,2,3,  	14,1,2,3,   // Nb*32 = 192
		14,1,3,4, 	14,1,3,4,  	14,1,3,4,   // Nb*32 = 256
	};

	/*
	unsigned char shift_row_map[16] = {
	0x0,0x1,0x2,0x3,
	0x4,0x5,0x6,0x7,
	0x8,0x9,0xA,0xB,
	0xC,0xD,0xE,0xF,
	};
	*/

	//TODO: make 24 & 32 bytes version of this maps
	unsigned char shift_row_map[16] = {
		0x0,0x5,0xA,0xF,
		0x4,0x9,0xE,0x3,
		0x8,0xD,0x2,0x7,
		0xC,0x1,0x6,0xB,
	};

	unsigned char inv_shift_row_map[16] = { // TODO copy to const memory
		0x0,0xD,0xA,0x7,
		0x4,0x1,0xE,0xB,
		0x8,0x5,0x2,0xF,
		0xC,0x9,0x6,0x3,
	};

	// tables for inverses, byte sub
	unsigned char gf2_8_inv[256] = {
		0x00,0x01,0x8d,0xf6,0xcb,0x52,0x7b,0xd1,0xe8,0x4f,0x29,0xc0,0xb0,0xe1,0xe5,0xc7,
		0x74,0xb4,0xaa,0x4b,0x99,0x2b,0x60,0x5f,0x58,0x3f,0xfd,0xcc,0xff,0x40,0xee,0xb2,
		0x3a,0x6e,0x5a,0xf1,0x55,0x4d,0xa8,0xc9,0xc1,0x0a,0x98,0x15,0x30,0x44,0xa2,0xc2,
		0x2c,0x45,0x92,0x6c,0xf3,0x39,0x66,0x42,0xf2,0x35,0x20,0x6f,0x77,0xbb,0x59,0x19,
		0x1d,0xfe,0x37,0x67,0x2d,0x31,0xf5,0x69,0xa7,0x64,0xab,0x13,0x54,0x25,0xe9,0x09,
		0xed,0x5c,0x05,0xca,0x4c,0x24,0x87,0xbf,0x18,0x3e,0x22,0xf0,0x51,0xec,0x61,0x17,
		0x16,0x5e,0xaf,0xd3,0x49,0xa6,0x36,0x43,0xf4,0x47,0x91,0xdf,0x33,0x93,0x21,0x3b,
		0x79,0xb7,0x97,0x85,0x10,0xb5,0xba,0x3c,0xb6,0x70,0xd0,0x06,0xa1,0xfa,0x81,0x82,
		0x83,0x7e,0x7f,0x80,0x96,0x73,0xbe,0x56,0x9b,0x9e,0x95,0xd9,0xf7,0x02,0xb9,0xa4,
		0xde,0x6a,0x32,0x6d,0xd8,0x8a,0x84,0x72,0x2a,0x14,0x9f,0x88,0xf9,0xdc,0x89,0x9a,
		0xfb,0x7c,0x2e,0xc3,0x8f,0xb8,0x65,0x48,0x26,0xc8,0x12,0x4a,0xce,0xe7,0xd2,0x62,
		0x0c,0xe0,0x1f,0xef,0x11,0x75,0x78,0x71,0xa5,0x8e,0x76,0x3d,0xbd,0xbc,0x86,0x57,
		0x0b,0x28,0x2f,0xa3,0xda,0xd4,0xe4,0x0f,0xa9,0x27,0x53,0x04,0x1b,0xfc,0xac,0xe6,
		0x7a,0x07,0xae,0x63,0xc5,0xdb,0xe2,0xea,0x94,0x8b,0xc4,0xd5,0x9d,0xf8,0x90,0x6b,
		0xb1,0x0d,0xd6,0xeb,0xc6,0x0e,0xcf,0xad,0x08,0x4e,0xd7,0xe3,0x5d,0x50,0x1e,0xb3,
		0x5b,0x23,0x38,0x34,0x68,0x46,0x03,0x8c,0xdd,0x9c,0x7d,0xa0,0xcd,0x1a,0x41,0x1c,
	};

	unsigned char byte_sub[256]= {
		0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5,0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76,
		0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0,0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0,
		0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc,0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15,
		0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a,0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75,
		0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0,0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84,
		0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b,0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf,
		0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85,0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8,
		0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5,0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2,
		0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17,0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73,
		0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88,0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb,
		0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c,0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79,
		0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9,0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08,
		0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6,0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a,
		0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e,0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e,
		0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94,0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf,
		0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68,0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16,
	}; 

	unsigned char inv_byte_sub[256]= {
		0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38,0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb,
		0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87,0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb,
		0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d,0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e,
		0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2,0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25,
		0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16,0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92,
		0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda,0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84,
		0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a,0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06,
		0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02,0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b,
		0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea,0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73,
		0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85,0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e,
		0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89,0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b,
		0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20,0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4,
		0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31,0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f,
		0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d,0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef,
		0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0,0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61,
		0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26,0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d,
	}; 

	// this table needs Nb*(Nr+1)/Nk entries - up to 8*(15)/4 = 60
	unsigned long Rcon[60] = { // todo -  this table may be stored as bytes or made on the fly
		0x00000000,0x00000001,0x00000002,0x00000004,0x00000008,0x00000010,0x00000020,0x00000040,
		0x00000080,0x0000001b,0x00000036,0x0000006c,0x000000d8,0x000000ab,0x0000004d,0x0000009a,
		0x0000002f,0x0000005e,0x000000bc,0x00000063,0x000000c6,0x00000097,0x00000035,0x0000006a,
		0x000000d4,0x000000b3,0x0000007d,0x000000fa,0x000000ef,0x000000c5,0x00000091,0x00000039,
		0x00000072,0x000000e4,0x000000d3,0x000000bd,0x00000061,0x000000c2,0x0000009f,0x00000025,
		0x0000004a,0x00000094,0x00000033,0x00000066,0x000000cc,0x00000083,0x0000001d,0x0000003a,
		0x00000074,0x000000e8,0x000000cb,0x0000008d,0x00000001,0x00000002,0x00000004,0x00000008,
		0x00000010,0x00000020,0x00000040,0x0000001b,	
	};


	//Multiply by 2:
	unsigned char mult_by_2[256]= {
		0x00,0x02,0x04,0x06,0x08,0x0a,0x0c,0x0e,0x10,0x12,0x14,0x16,0x18,0x1a,0x1c,0x1e,
		0x20,0x22,0x24,0x26,0x28,0x2a,0x2c,0x2e,0x30,0x32,0x34,0x36,0x38,0x3a,0x3c,0x3e,
		0x40,0x42,0x44,0x46,0x48,0x4a,0x4c,0x4e,0x50,0x52,0x54,0x56,0x58,0x5a,0x5c,0x5e,
		0x60,0x62,0x64,0x66,0x68,0x6a,0x6c,0x6e,0x70,0x72,0x74,0x76,0x78,0x7a,0x7c,0x7e,
		0x80,0x82,0x84,0x86,0x88,0x8a,0x8c,0x8e,0x90,0x92,0x94,0x96,0x98,0x9a,0x9c,0x9e,
		0xa0,0xa2,0xa4,0xa6,0xa8,0xaa,0xac,0xae,0xb0,0xb2,0xb4,0xb6,0xb8,0xba,0xbc,0xbe,
		0xc0,0xc2,0xc4,0xc6,0xc8,0xca,0xcc,0xce,0xd0,0xd2,0xd4,0xd6,0xd8,0xda,0xdc,0xde,
		0xe0,0xe2,0xe4,0xe6,0xe8,0xea,0xec,0xee,0xf0,0xf2,0xf4,0xf6,0xf8,0xfa,0xfc,0xfe,
		0x1b,0x19,0x1f,0x1d,0x13,0x11,0x17,0x15,0x0b,0x09,0x0f,0x0d,0x03,0x01,0x07,0x05,
		0x3b,0x39,0x3f,0x3d,0x33,0x31,0x37,0x35,0x2b,0x29,0x2f,0x2d,0x23,0x21,0x27,0x25,
		0x5b,0x59,0x5f,0x5d,0x53,0x51,0x57,0x55,0x4b,0x49,0x4f,0x4d,0x43,0x41,0x47,0x45,
		0x7b,0x79,0x7f,0x7d,0x73,0x71,0x77,0x75,0x6b,0x69,0x6f,0x6d,0x63,0x61,0x67,0x65,
		0x9b,0x99,0x9f,0x9d,0x93,0x91,0x97,0x95,0x8b,0x89,0x8f,0x8d,0x83,0x81,0x87,0x85,
		0xbb,0xb9,0xbf,0xbd,0xb3,0xb1,0xb7,0xb5,0xab,0xa9,0xaf,0xad,0xa3,0xa1,0xa7,0xa5,
		0xdb,0xd9,0xdf,0xdd,0xd3,0xd1,0xd7,0xd5,0xcb,0xc9,0xcf,0xcd,0xc3,0xc1,0xc7,0xc5,
		0xfb,0xf9,0xff,0xfd,0xf3,0xf1,0xf7,0xf5,0xeb,0xe9,0xef,0xed,0xe3,0xe1,0xe7,0xe5 };

	//Multiply by 3:
	unsigned char mult_by_3[256]= {
		0x00,0x03,0x06,0x05,0x0c,0x0f,0x0a,0x09,0x18,0x1b,0x1e,0x1d,0x14,0x17,0x12,0x11,
		0x30,0x33,0x36,0x35,0x3c,0x3f,0x3a,0x39,0x28,0x2b,0x2e,0x2d,0x24,0x27,0x22,0x21,
		0x60,0x63,0x66,0x65,0x6c,0x6f,0x6a,0x69,0x78,0x7b,0x7e,0x7d,0x74,0x77,0x72,0x71,
		0x50,0x53,0x56,0x55,0x5c,0x5f,0x5a,0x59,0x48,0x4b,0x4e,0x4d,0x44,0x47,0x42,0x41,
		0xc0,0xc3,0xc6,0xc5,0xcc,0xcf,0xca,0xc9,0xd8,0xdb,0xde,0xdd,0xd4,0xd7,0xd2,0xd1,
		0xf0,0xf3,0xf6,0xf5,0xfc,0xff,0xfa,0xf9,0xe8,0xeb,0xee,0xed,0xe4,0xe7,0xe2,0xe1,
		0xa0,0xa3,0xa6,0xa5,0xac,0xaf,0xaa,0xa9,0xb8,0xbb,0xbe,0xbd,0xb4,0xb7,0xb2,0xb1,
		0x90,0x93,0x96,0x95,0x9c,0x9f,0x9a,0x99,0x88,0x8b,0x8e,0x8d,0x84,0x87,0x82,0x81,
		0x9b,0x98,0x9d,0x9e,0x97,0x94,0x91,0x92,0x83,0x80,0x85,0x86,0x8f,0x8c,0x89,0x8a,
		0xab,0xa8,0xad,0xae,0xa7,0xa4,0xa1,0xa2,0xb3,0xb0,0xb5,0xb6,0xbf,0xbc,0xb9,0xba,
		0xfb,0xf8,0xfd,0xfe,0xf7,0xf4,0xf1,0xf2,0xe3,0xe0,0xe5,0xe6,0xef,0xec,0xe9,0xea,
		0xcb,0xc8,0xcd,0xce,0xc7,0xc4,0xc1,0xc2,0xd3,0xd0,0xd5,0xd6,0xdf,0xdc,0xd9,0xda,
		0x5b,0x58,0x5d,0x5e,0x57,0x54,0x51,0x52,0x43,0x40,0x45,0x46,0x4f,0x4c,0x49,0x4a,
		0x6b,0x68,0x6d,0x6e,0x67,0x64,0x61,0x62,0x73,0x70,0x75,0x76,0x7f,0x7c,0x79,0x7a,
		0x3b,0x38,0x3d,0x3e,0x37,0x34,0x31,0x32,0x23,0x20,0x25,0x26,0x2f,0x2c,0x29,0x2a,
		0x0b,0x08,0x0d,0x0e,0x07,0x04,0x01,0x02,0x13,0x10,0x15,0x16,0x1f,0x1c,0x19,0x1a };

	//Multiply by 9:
	unsigned char mult_by_9[256]= {
		0x00,0x09,0x12,0x1b,0x24,0x2d,0x36,0x3f,0x48,0x41,0x5a,0x53,0x6c,0x65,0x7e,0x77,
		0x90,0x99,0x82,0x8b,0xb4,0xbd,0xa6,0xaf,0xd8,0xd1,0xca,0xc3,0xfc,0xf5,0xee,0xe7,
		0x3b,0x32,0x29,0x20,0x1f,0x16,0x0d,0x04,0x73,0x7a,0x61,0x68,0x57,0x5e,0x45,0x4c,
		0xab,0xa2,0xb9,0xb0,0x8f,0x86,0x9d,0x94,0xe3,0xea,0xf1,0xf8,0xc7,0xce,0xd5,0xdc,
		0x76,0x7f,0x64,0x6d,0x52,0x5b,0x40,0x49,0x3e,0x37,0x2c,0x25,0x1a,0x13,0x08,0x01,
		0xe6,0xef,0xf4,0xfd,0xc2,0xcb,0xd0,0xd9,0xae,0xa7,0xbc,0xb5,0x8a,0x83,0x98,0x91,
		0x4d,0x44,0x5f,0x56,0x69,0x60,0x7b,0x72,0x05,0x0c,0x17,0x1e,0x21,0x28,0x33,0x3a,
		0xdd,0xd4,0xcf,0xc6,0xf9,0xf0,0xeb,0xe2,0x95,0x9c,0x87,0x8e,0xb1,0xb8,0xa3,0xaa,
		0xec,0xe5,0xfe,0xf7,0xc8,0xc1,0xda,0xd3,0xa4,0xad,0xb6,0xbf,0x80,0x89,0x92,0x9b,
		0x7c,0x75,0x6e,0x67,0x58,0x51,0x4a,0x43,0x34,0x3d,0x26,0x2f,0x10,0x19,0x02,0x0b,
		0xd7,0xde,0xc5,0xcc,0xf3,0xfa,0xe1,0xe8,0x9f,0x96,0x8d,0x84,0xbb,0xb2,0xa9,0xa0,
		0x47,0x4e,0x55,0x5c,0x63,0x6a,0x71,0x78,0x0f,0x06,0x1d,0x14,0x2b,0x22,0x39,0x30,
		0x9a,0x93,0x88,0x81,0xbe,0xb7,0xac,0xa5,0xd2,0xdb,0xc0,0xc9,0xf6,0xff,0xe4,0xed,
		0x0a,0x03,0x18,0x11,0x2e,0x27,0x3c,0x35,0x42,0x4b,0x50,0x59,0x66,0x6f,0x74,0x7d,
		0xa1,0xa8,0xb3,0xba,0x85,0x8c,0x97,0x9e,0xe9,0xe0,0xfb,0xf2,0xcd,0xc4,0xdf,0xd6,
		0x31,0x38,0x23,0x2a,0x15,0x1c,0x07,0x0e,0x79,0x70,0x6b,0x62,0x5d,0x54,0x4f,0x46 };

	//Multiply by 11:
	unsigned char mult_by_11[256]= {
		0x00,0x0b,0x16,0x1d,0x2c,0x27,0x3a,0x31,0x58,0x53,0x4e,0x45,0x74,0x7f,0x62,0x69,
		0xb0,0xbb,0xa6,0xad,0x9c,0x97,0x8a,0x81,0xe8,0xe3,0xfe,0xf5,0xc4,0xcf,0xd2,0xd9,
		0x7b,0x70,0x6d,0x66,0x57,0x5c,0x41,0x4a,0x23,0x28,0x35,0x3e,0x0f,0x04,0x19,0x12,
		0xcb,0xc0,0xdd,0xd6,0xe7,0xec,0xf1,0xfa,0x93,0x98,0x85,0x8e,0xbf,0xb4,0xa9,0xa2,
		0xf6,0xfd,0xe0,0xeb,0xda,0xd1,0xcc,0xc7,0xae,0xa5,0xb8,0xb3,0x82,0x89,0x94,0x9f,
		0x46,0x4d,0x50,0x5b,0x6a,0x61,0x7c,0x77,0x1e,0x15,0x08,0x03,0x32,0x39,0x24,0x2f,
		0x8d,0x86,0x9b,0x90,0xa1,0xaa,0xb7,0xbc,0xd5,0xde,0xc3,0xc8,0xf9,0xf2,0xef,0xe4,
		0x3d,0x36,0x2b,0x20,0x11,0x1a,0x07,0x0c,0x65,0x6e,0x73,0x78,0x49,0x42,0x5f,0x54,
		0xf7,0xfc,0xe1,0xea,0xdb,0xd0,0xcd,0xc6,0xaf,0xa4,0xb9,0xb2,0x83,0x88,0x95,0x9e,
		0x47,0x4c,0x51,0x5a,0x6b,0x60,0x7d,0x76,0x1f,0x14,0x09,0x02,0x33,0x38,0x25,0x2e,
		0x8c,0x87,0x9a,0x91,0xa0,0xab,0xb6,0xbd,0xd4,0xdf,0xc2,0xc9,0xf8,0xf3,0xee,0xe5,
		0x3c,0x37,0x2a,0x21,0x10,0x1b,0x06,0x0d,0x64,0x6f,0x72,0x79,0x48,0x43,0x5e,0x55,
		0x01,0x0a,0x17,0x1c,0x2d,0x26,0x3b,0x30,0x59,0x52,0x4f,0x44,0x75,0x7e,0x63,0x68,
		0xb1,0xba,0xa7,0xac,0x9d,0x96,0x8b,0x80,0xe9,0xe2,0xff,0xf4,0xc5,0xce,0xd3,0xd8,
		0x7a,0x71,0x6c,0x67,0x56,0x5d,0x40,0x4b,0x22,0x29,0x34,0x3f,0x0e,0x05,0x18,0x13,
		0xca,0xc1,0xdc,0xd7,0xe6,0xed,0xf0,0xfb,0x92,0x99,0x84,0x8f,0xbe,0xb5,0xa8,0xa3 };

	//Multiply by 13:
	unsigned char mult_by_13[256]= {
		0x00,0x0d,0x1a,0x17,0x34,0x39,0x2e,0x23,0x68,0x65,0x72,0x7f,0x5c,0x51,0x46,0x4b,
		0xd0,0xdd,0xca,0xc7,0xe4,0xe9,0xfe,0xf3,0xb8,0xb5,0xa2,0xaf,0x8c,0x81,0x96,0x9b,
		0xbb,0xb6,0xa1,0xac,0x8f,0x82,0x95,0x98,0xd3,0xde,0xc9,0xc4,0xe7,0xea,0xfd,0xf0,
		0x6b,0x66,0x71,0x7c,0x5f,0x52,0x45,0x48,0x03,0x0e,0x19,0x14,0x37,0x3a,0x2d,0x20,
		0x6d,0x60,0x77,0x7a,0x59,0x54,0x43,0x4e,0x05,0x08,0x1f,0x12,0x31,0x3c,0x2b,0x26,
		0xbd,0xb0,0xa7,0xaa,0x89,0x84,0x93,0x9e,0xd5,0xd8,0xcf,0xc2,0xe1,0xec,0xfb,0xf6,
		0xd6,0xdb,0xcc,0xc1,0xe2,0xef,0xf8,0xf5,0xbe,0xb3,0xa4,0xa9,0x8a,0x87,0x90,0x9d,
		0x06,0x0b,0x1c,0x11,0x32,0x3f,0x28,0x25,0x6e,0x63,0x74,0x79,0x5a,0x57,0x40,0x4d,
		0xda,0xd7,0xc0,0xcd,0xee,0xe3,0xf4,0xf9,0xb2,0xbf,0xa8,0xa5,0x86,0x8b,0x9c,0x91,
		0x0a,0x07,0x10,0x1d,0x3e,0x33,0x24,0x29,0x62,0x6f,0x78,0x75,0x56,0x5b,0x4c,0x41,
		0x61,0x6c,0x7b,0x76,0x55,0x58,0x4f,0x42,0x09,0x04,0x13,0x1e,0x3d,0x30,0x27,0x2a,
		0xb1,0xbc,0xab,0xa6,0x85,0x88,0x9f,0x92,0xd9,0xd4,0xc3,0xce,0xed,0xe0,0xf7,0xfa,
		0xb7,0xba,0xad,0xa0,0x83,0x8e,0x99,0x94,0xdf,0xd2,0xc5,0xc8,0xeb,0xe6,0xf1,0xfc,
		0x67,0x6a,0x7d,0x70,0x53,0x5e,0x49,0x44,0x0f,0x02,0x15,0x18,0x3b,0x36,0x21,0x2c,
		0x0c,0x01,0x16,0x1b,0x38,0x35,0x22,0x2f,0x64,0x69,0x7e,0x73,0x50,0x5d,0x4a,0x47,
		0xdc,0xd1,0xc6,0xcb,0xe8,0xe5,0xf2,0xff,0xb4,0xb9,0xae,0xa3,0x80,0x8d,0x9a,0x97 };

	//Multiply by 14:
	unsigned char mult_by_14[256]= {
		0x00,0x0e,0x1c,0x12,0x38,0x36,0x24,0x2a,0x70,0x7e,0x6c,0x62,0x48,0x46,0x54,0x5a,
		0xe0,0xee,0xfc,0xf2,0xd8,0xd6,0xc4,0xca,0x90,0x9e,0x8c,0x82,0xa8,0xa6,0xb4,0xba,
		0xdb,0xd5,0xc7,0xc9,0xe3,0xed,0xff,0xf1,0xab,0xa5,0xb7,0xb9,0x93,0x9d,0x8f,0x81,
		0x3b,0x35,0x27,0x29,0x03,0x0d,0x1f,0x11,0x4b,0x45,0x57,0x59,0x73,0x7d,0x6f,0x61,
		0xad,0xa3,0xb1,0xbf,0x95,0x9b,0x89,0x87,0xdd,0xd3,0xc1,0xcf,0xe5,0xeb,0xf9,0xf7,
		0x4d,0x43,0x51,0x5f,0x75,0x7b,0x69,0x67,0x3d,0x33,0x21,0x2f,0x05,0x0b,0x19,0x17,
		0x76,0x78,0x6a,0x64,0x4e,0x40,0x52,0x5c,0x06,0x08,0x1a,0x14,0x3e,0x30,0x22,0x2c,
		0x96,0x98,0x8a,0x84,0xae,0xa0,0xb2,0xbc,0xe6,0xe8,0xfa,0xf4,0xde,0xd0,0xc2,0xcc,
		0x41,0x4f,0x5d,0x53,0x79,0x77,0x65,0x6b,0x31,0x3f,0x2d,0x23,0x09,0x07,0x15,0x1b,
		0xa1,0xaf,0xbd,0xb3,0x99,0x97,0x85,0x8b,0xd1,0xdf,0xcd,0xc3,0xe9,0xe7,0xf5,0xfb,
		0x9a,0x94,0x86,0x88,0xa2,0xac,0xbe,0xb0,0xea,0xe4,0xf6,0xf8,0xd2,0xdc,0xce,0xc0,
		0x7a,0x74,0x66,0x68,0x42,0x4c,0x5e,0x50,0x0a,0x04,0x16,0x18,0x32,0x3c,0x2e,0x20,
		0xec,0xe2,0xf0,0xfe,0xd4,0xda,0xc8,0xc6,0x9c,0x92,0x80,0x8e,0xa4,0xaa,0xb8,0xb6,
		0x0c,0x02,0x10,0x1e,0x34,0x3a,0x28,0x26,0x7c,0x72,0x60,0x6e,0x44,0x4a,0x58,0x56,
		0x37,0x39,0x2b,0x25,0x0f,0x01,0x13,0x1d,0x47,0x49,0x5b,0x55,0x7f,0x71,0x63,0x6d,
		0xd7,0xd9,0xcb,0xc5,0xef,0xe1,0xf3,0xfd,0xa7,0xa9,0xbb,0xb5,0x9f,0x91,0x83,0x8d };

	// mult 2 elements using gf2_8_poly as a reduction
	unsigned char GF2_8_mult(unsigned char a, unsigned char b)
	{ // todo - make 4x4 table for nibbles, use lookup
		unsigned char result = 0;

		// should give 0x57 . 0x13 = 0xFE with poly 0x11B
		// 

		int count = 8;
		while (count--)
		{
			if (b&1)
				result ^= a;
			if (a&128)
			{
				a <<= 1;

				a ^= (gf2_8_poly&255);
			}
			else
				a <<= 1;
			b >>= 1;
		}
		return result;
	} // GF2_8_mult

	// some functions to create/verify table integrity
	bool CheckInverses(bool create)
	{
		// we'll brute force the inverse table
		assert(GF2_8_mult(0x57,0x13) == 0xFE); // test these first
		assert(GF2_8_mult(0x01,0x01) == 0x01);
		assert(GF2_8_mult(0xFF,0x55) == 0xF8);


		unsigned int a,b; // need int here to prevent wraps in loop
		if (create == true)
			const_cast<unsigned char*>(gf2_8_inv)[0] = 0;
		else if (gf2_8_inv[0] != 0)
			return false;
		for (a = 1; a <= 255; a++)
		{
			b = 1;
			while (GF2_8_mult(a,b) != 1)
				b++;

			if (create == true)
				const_cast<unsigned char *>(gf2_8_inv)[a] = b;
			else if (gf2_8_inv[a] != b)
				return false;
		}
		return true;
	} // CheckInverses

	unsigned char BitSum(unsigned char byte) 
	{ // return the sum of bits mod 2
		byte = (byte>>4)^(byte&15);
		byte = (byte>>2)^(byte&3);
		return (byte>>1)^(byte&1);
	} // BitSum

	bool CheckByteSub(bool create)
	{
		if (CheckInverses(create) == false)
			return false; // we cannot do this without inverses

		unsigned int x,y; // need ints here to prevent wrap in loop
		for (x = 0; x <= 255; x++)
		{
			y = gf2_8_inv[x]; // inverse to start with

			// affine transform
			y = BitSum(y&0xF1) | (BitSum(y&0xE3)<<1) | (BitSum(y&0xC7)<<2) | (BitSum(y&0x8F)<<3) |
				(BitSum(y&0x1F)<<4) | (BitSum(y&0x3E)<<5) | (BitSum(y&0x7C)<<6) | (BitSum(y&0xF8)<<7);
			y = y ^ 0x63;
			if (create == true)
				const_cast<unsigned char *>(byte_sub)[x] = y;
			else if (byte_sub[x] != y)
				return false;
		}
		return true;
	} // CheckByteSub

	bool CheckInvByteSub(bool create)
	{
		if (CheckInverses(create) == false)
			return false; // we cannot do this without inverses
		if (CheckByteSub(create) == false)
			return false; // we cannot do this without byte_sub

		unsigned int x,y; // need ints here to prevent wrap in loop
		for (x = 0; x <= 255; x++)
		{
			// we brute force it...
			y = 0;
			while (byte_sub[y] != x)
				y++;
			if (create == true)
				const_cast<unsigned char *>(inv_byte_sub)[x] = y;
			else if (inv_byte_sub[x] != y)
				return false;
		}
		return true;
	} // CheckInvByteSub

	bool CheckRcon(bool create)
	{
		unsigned char Ri = 1; // start here

		if (create == true)
			Rcon[0] = 0;
		else if (Rcon[0] != 0)
			return false; // todo - this is unused still check?
		for (int i = 1; i < sizeof(Rcon)/sizeof(Rcon[0])-1; i++)
		{
			if (create == true)
				Rcon[i] = Ri;
			else if (Rcon[i] != Ri)
				return false;
			Ri = GF2_8_mult(Ri,0x02); // multiply by x - todo replace with xmult
		}
		return true;
	} // CheckRCon

	void DumpCharTable(ostream & out, const char * name, const unsigned char * table, int length)
	{ // dump te contents of a table to a file
		int pos;
		out << name << endl << hex;
		for (pos = 0; pos < length; pos++)
		{
			out << "0x";
			if (table[pos] < 16)
				out << '0';
			out << static_cast<unsigned int>(table[pos]) << ',';
			if ((pos %16) == 15)
				out << endl;
		}
		out << dec;
	} // DumpCharTable

	void DumpLongTable(ostream & out, const char * name, const unsigned long * table, int length)
	{ // dump te contents of a table to a file
		int pos;
		out << name << endl << hex;
		for (pos = 0; pos < length; pos++)
		{
			out << "0x";
			if (table[pos] < 16)
				out << '0';
			if (table[pos] < 16*16)
				out << '0';
			if (table[pos] < 16*16*16)
				out << '0';
			if (table[pos] < 16*16*16*16)
				out << '0';
			if (table[pos] < 16*16*16*16*16)
				out << '0';
			if (table[pos] < 16*16*16*16*16*16)
				out << '0';
			if (table[pos] < 16*16*16*16*16*16*16)
				out << '0';
			out << static_cast<unsigned int>(table[pos]) << ',';
			if ((pos % 8) == 7)
				out << endl;
		}
		out << dec;
	} // DumpCharTable

	// return true iff tables are valid. create = true fills them in if not
	bool CreateRijndaelTables(bool create, bool create_file)
	{
		bool retval = true;
		if (CheckInverses(create) == false)
			retval = false;
		if (CheckByteSub(create) == false)
			retval = false;
		if (CheckInvByteSub(create) == false)
			retval = false;
		if (CheckRcon(create) == false)
			return false;

		if (create_file == true)
		{ // dump tables
			ofstream out;
			out.open("Tables.dat");
			if (out.is_open() == true)
			{
				DumpCharTable(out,"gf2_8_inv", gf2_8_inv, 256);
				out << "\n\n";
				DumpCharTable(out,"byte_sub", byte_sub, 256);
				out << "\n\n";
				DumpCharTable(out,"inv_byte_sub", inv_byte_sub, 256);
				out << "\n\n";
				DumpLongTable(out,"RCon", Rcon, 60);
				out.close();
			}
		}
		return retval;
	} // CreateRijndaelTables

	void DumpHex(const unsigned char * table, int length)
	{ // dump some hex values for debugging
		int pos;
		cerr << hex;
		for (pos = 0; pos < length; pos++)
		{
			if (table[pos] < 16)
				cerr << '0';
			cerr << static_cast<unsigned int>(table[pos]) << ' ';
			if ((pos %16) == 15)
				cerr << endl;
		}
		cerr << dec;
	} // DumpHex

	void runKernelTest() 
	{
		int devID;
		cudaDeviceProp deviceProps;
		devID = cutGetMaxGflopsDeviceId();
		cudaSetDevice( devID );

		// get number of SMs on this GPU
		cutilSafeCall(cudaGetDeviceProperties(&deviceProps, devID));
		printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name, deviceProps.multiProcessorCount);

		unsigned int timer = 0;
		cutilCheckError( cutCreateTimer( &timer));
		cutilCheckError( cutStartTimer( timer));

		unsigned int num_threads = 1;
		unsigned int mem_size = sizeof( float) * num_threads;


		//////////////////////////////////////////////////////////////////
		//Architectures with compute capability 1.x, function
		//cuPrintf() is used. Otherwise, function printf() is called.
		bool use_cuPrintf = (deviceProps.major < 2);

		if (use_cuPrintf)
		{
			//Initializaton, allocate buffers on both host
			//and device for data to be printed.
			cudaPrintfInit();
			//printf("cuPrintf() is called.\n");
		}
		/////////////////////////////////////////////////////////////////


		// allocate host memory
		int* h_idata = (int*) malloc( mem_size);
		// initalize the memory
		for( unsigned int i = 0; i < num_threads; ++i) 
		{
			h_idata[i] = (int) i;
		}

		// allocate device memory
		int* d_idata;
		cutilSafeCall( cudaMalloc( (void**) &d_idata, mem_size));
		// copy host memory to device
		cutilSafeCall( cudaMemcpy( d_idata, h_idata, mem_size,
			cudaMemcpyHostToDevice) );

		//cutilSafeCall( cudaMemcpyToSymbol( "d_byte_sub_const", &byte_sub, 256) );

		// allocate device memory for result
		int* d_odata;
		cutilSafeCall( cudaMalloc( (void**) &d_odata, mem_size));

		// setup execution parameters
		dim3  grid( 1, 1, 1);
		dim3  threads( num_threads, 1, 1);

		printf("launching GPU kernel...\n");
		// execute the kernel
		testKernel<<< grid, threads, mem_size >>>( d_idata, d_odata);
		cutilDeviceSynchronize(); // ???

		// check if kernel execution generated and error
		cutilCheckMsg("Kernel execution failed");


		/////////////////////////////////////////////////////////////
		if (use_cuPrintf)
		{
			//Dump current contents of output buffer to standard 
			//output, and origin (block id and thread id) of each line 
			//of output is enabled(true).
			cudaPrintfDisplay(stdout, true);

			//Free allocated buffers by cudaPrintfInit().
			cudaPrintfEnd();
		}
		//////////////////////////////////////////////////////////////



		// allocate mem for the result on host side
		int* h_odata = (int*) malloc( mem_size);
		// copy result from device to host
		cutilSafeCall( cudaMemcpy( h_odata, d_odata, sizeof(int) * num_threads,
			cudaMemcpyDeviceToHost) );

		cutilCheckError( cutStopTimer( timer));
		printf( "GPU Processing time: %f (ms)\n", cutGetTimerValue( timer));
		cutilCheckError( cutDeleteTimer( timer));

		// cleanup memory
		free( h_idata);
		free( h_odata);
		cutilSafeCall(cudaFree(d_idata));
		cutilSafeCall(cudaFree(d_odata));

		cutilDeviceReset();
	}	
} // end of GPU namespace

using namespace GPU;

// the transforms
void Rijndael_GPU::ByteSub(void)
{
	for (int pos = 0; pos < state_size; pos++)
		state[pos] = byte_sub[state[pos]];
} // ByteSub

void Rijndael_GPU::InvByteSub(void)
{
	unsigned char * s = state;
	for (int pos = 0; pos < state_size; pos++)
		*s = inv_byte_sub[*s++];
} // InvByteSub

void Rijndael_GPU::ShiftRow(void)
{
	unsigned char arr[10];
	int i,j;

	// copy out row, then copy back 2 pieces shifted
	for (j=0,i = 1; j < Nb; i += 4,j++)
		arr[j] = state[i];
	for (j=C1,i = 1; j < Nb; i += 4,j++)
		state[i] = arr[j];
	for (j=0,i = 1 + 4*(Nb-C1); j < C1; i += 4,j++)
		state[i] = arr[j];

	for (j=0,i = 2; j < Nb; i += 4,j++)
		arr[j] = state[i];
	for (j=C2,i = 2; j < Nb; i += 4,j++)
		state[i] = arr[j];
	for (j=0,i = 2 + 4*(Nb-C2); j < C2; i += 4,j++)
		state[i] = arr[j];

	for (j=0,i = 3; j < Nb; i += 4,j++)
		arr[j] = state[i];
	for (j=C3,i = 3; j < Nb; i += 4,j++)
		state[i] = arr[j];
	for (j=0,i = 3 + 4*(Nb-C3); j < C3; i += 4,j++)
		state[i] = arr[j];
} // ShiftRow

void Rijndael_GPU::InvShiftRow(void)
{
	unsigned char arr[10];
	int i,j;

	for (j=0,i = 1; j < Nb; i += 4,j++)
		arr[j] = state[i];
	for (j=Nb-C1,i = 1; j < Nb; i += 4,j++)
		state[i] = arr[j];
	for (j=0,i = 1 + 4*C1; j < Nb-C1; i += 4,j++)
		state[i] = arr[j];

	for (j=0,i = 2; j < Nb; i += 4,j++)
		arr[j] = state[i];
	for (j=Nb-C2,i = 2; j < Nb; i += 4,j++)
		state[i] = arr[j];
	for (j=0,i = 2 + 4*C2; j < Nb-C2; i += 4,j++)
		state[i] = arr[j];

	for (j=0,i = 3; j < Nb; i += 4,j++)
		arr[j] = state[i];
	for (j=Nb-C3,i = 3; j < Nb; i += 4,j++)
		state[i] = arr[j];
	for (j=0,i = 3 + 4*C3; j < Nb-C3; i += 4,j++)
		state[i] = arr[j];
} // InvShiftRow

#define xmult(a) ((a)<<1) ^ (((a)&128) ? 0x01B : 0)

void Rijndael_GPU::MixColumn(void)
{ // poly32 used here - we hard coded - todo - use defines
	unsigned char a0,a1,a2,a3,b0,b1,b2,b3;
	for (int col = 0; col < Nb; col++)
	{
		a0 = state[col*4+0];
		a1 = state[col*4+1];
		a2 = state[col*4+2];
		a3 = state[col*4+3];

		// todo - this could be sped up with a 2 = xmult function, and 3 = xmult(a)^a
		b0 = xmult(a0)^a1^xmult(a1)^a2^a3;
		b1 = a0^xmult(a1)^a2^xmult(a2)^a3;
		b2 = a0^a1^xmult(a2)^a3^xmult(a3);
		b3 = a0^xmult(a0)^a1^a2^xmult(a3);

		state[col*4+0] = b0;
		state[col*4+1] = b1;
		state[col*4+2] = b2;
		state[col*4+3] = b3;
	}
} // MixColumn

void Rijndael_GPU::InvMixColumn(void)
{ // poly32_inv used here - we hard coded - todo - defines
	unsigned char a0,a1,a2,a3,b0,b1,b2,b3;
	for (int col = 0; col < Nb; col++)
	{
		a0 = state[4*col+0];
		a1 = state[4*col+1];
		a2 = state[4*col+2];
		a3 = state[4*col+3];

		b0 = GF2_8_mult(0x0E,a0)^GF2_8_mult(0x0B,a1)^
			GF2_8_mult(0x0D,a2)^GF2_8_mult(0x09,a3);
		b1 = GF2_8_mult(0x09,a0)^GF2_8_mult(0x0E,a1)^
			GF2_8_mult(0x0B,a2)^GF2_8_mult(0x0D,a3);
		b2 = GF2_8_mult(0x0D,a0)^GF2_8_mult(0x09,a1)^
			GF2_8_mult(0x0E,a2)^GF2_8_mult(0x0B,a3);
		b3 = GF2_8_mult(0x0B,a0)^GF2_8_mult(0x0D,a1)^
			GF2_8_mult(0x09,a2)^GF2_8_mult(0x0E,a3);

		state[4*col+0] = b0;
		state[4*col+1] = b1;
		state[4*col+2] = b2;
		state[4*col+3] = b3;
	}
} // InvMixColumn

void Rijndael_GPU::AddRoundKey(int round)
{
	const unsigned char * r_ptr = W + round * state_size;
	unsigned char * s_ptr = state;

	for (int pos = 0; pos < state_size; pos++)
		*s_ptr++ ^= *r_ptr++;

} // AddRoundKey

// the round functions
void Rijndael_GPU::Round(int round)
{
	ByteSub();
	ShiftRow();
	MixColumn();
	AddRoundKey(round);
} // Round

void Rijndael_GPU::InvRound(int round)
{
	AddRoundKey(round);
	InvMixColumn();
	InvShiftRow();
	InvByteSub();
} // InvRound

void Rijndael_GPU::FinalRound(int round)
{
	ByteSub();
	ShiftRow();
	AddRoundKey(round);
} // FinalRound

void Rijndael_GPU::InvFinalRound(int round)
{
	AddRoundKey(round);
	InvShiftRow();
	InvByteSub();
} // FinalRound

unsigned long Rijndael_GPU::RotByte(unsigned long data)
{ // bytes (a,b,c,d) -> (b,c,d,a)	so low becomes high
	return (data << 24) | (data >> 8);
	// todo inline with rotr

} // RotByte

unsigned long Rijndael_GPU::SubByte(unsigned long data)
{ // does the SBox on this 4 byte data
	unsigned result = 0;
	result = byte_sub[data>>24];
	result <<= 8;
	result |= byte_sub[(data>>16)&255];
	result <<= 8;
	result |= byte_sub[(data>>8)&255];
	result <<= 8;
	result |= byte_sub[data&255];
	return result;
} // SubByte

// Key expansion code - makes local copy
void Rijndael_GPU::KeyExpansion(const unsigned char * key)
{
	assert(Nk > 0);
	int i;
	unsigned long temp, * Wb = reinterpret_cast<unsigned long*>(W); // todo not portable - Endian problems
	if (Nk <= 6)
	{
		// todo - memcpy
		for (i = 0; i < 4*Nk; i++)
			W[i] = key[i];
		for (i = Nk; i < Nb*(Nr+1); i++)
		{
			temp = Wb[i-1];
			if ((i%Nk) == 0)
				temp = SubByte(RotByte(temp)) ^ Rcon[i/Nk];
			Wb[i] = Wb[i - Nk]^temp;
		}
	}
	else
	{
		// todo - memcpy
		for (i = 0; i < 4*Nk; i++)
			W[i] = key[i]; 
		for (i = Nk; i < Nb*(Nr+1); i++)
		{
			temp = Wb[i-1];
			if ((i%Nk) == 0)
				temp = SubByte(RotByte(temp)) ^ Rcon[i/Nk];
			else if ((i%Nk) == 4)
				temp = SubByte(temp);
			Wb[i] = Wb[i - Nk]^temp;
		}
	}
} // KeyExpansion

void Rijndael_GPU::SetParameters(int keylength, int blocklength)
{
	Nk = Nr = Nb = 0; // default values

	if ((keylength != 128) && (keylength != 192) && (keylength != 256))
		return; // nothing - todo - throw error?
	if ((blocklength != 128) && (blocklength != 192) && (blocklength != 256))
		return; // nothing - todo - throw error?

	// legal parameters, so fire it up
	Nk = keylength / 32;
	Nb = blocklength/32;

	state_size = 4*Nb; // bytes in state vector

	// fill memory
	Nr = parameters[((Nk-4)/2 + 3*(Nb-4)/2)*4+0];
	C1 = parameters[((Nk-4)/2 + 3*(Nb-4)/2)*4+1];
	C2 = parameters[((Nk-4)/2 + 3*(Nb-4)/2)*4+2];
	C3 = parameters[((Nk-4)/2 + 3*(Nb-4)/2)*4+3];
} // SetParameters

void Rijndael_GPU::StartEncryption(const unsigned char * key)
{
	KeyExpansion(key);
} // StartEncryption

void Rijndael_GPU::EncryptBlock(const unsigned char * datain1, unsigned char * dataout1, const unsigned char * states)
{ // todo ? allow in place encryption
	const unsigned long * datain = reinterpret_cast<const unsigned long*>(datain1);
	unsigned long * dataout = reinterpret_cast<unsigned long*>(dataout1);

	memcpy(state,datain,state_size);
	AddRoundKey(0);
	for (int i = 1; i < Nr; i++)
		{
#ifdef __DEBUG
		printf("round %d:\n",i);
#endif
		Round(i);
		if (0 != states)
			{ // compare
			if (memcmp(state,states+(i-1)*state_size,state_size) != 0)
				{
				cerr << "State " << i << " failed:\n";
				cerr << "State     : "; DumpHex(state,state_size);
				cerr << "Should be : "; DumpHex(states+(i-1)*state_size,state_size);
				}
			}
		}
	FinalRound(Nr);
	memcpy(dataout,state,state_size);

	/*
	//TODO: move these to initGPU
	int devID;
	cudaDeviceProp deviceProps;
	devID = cutGetMaxGflopsDeviceId();
	cudaSetDevice( devID );

	// get number of SMs on this GPU
	cutilSafeCall(cudaGetDeviceProperties(&deviceProps, devID));
	//printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name, deviceProps.multiProcessorCount);

	unsigned int num_threads = 16;	

	// allocate device memory
	size_t memsize = state_size;

	unsigned char* d_idata;
	cutilSafeCall( cudaMalloc( (void**) &d_idata, memsize));
	// copy host memory to device
	cutilSafeCall( cudaMemcpy( d_idata, datain1, memsize, cudaMemcpyHostToDevice));

	unsigned char* d_ext_key;
	int key_size = Nb*(Nr+1)*4;
	cutilSafeCall( cudaMalloc( (void**) &d_ext_key, key_size));
	// copy host memory to device
	cutilSafeCall( cudaMemcpy( d_ext_key, W, key_size, cudaMemcpyHostToDevice));

	// allocate device memory for result
	unsigned char* d_odata;
	cutilSafeCall( cudaMalloc( (void**) &d_odata, memsize));

	// setup execution parameters
	dim3  grid( 1, 1, 1);
	dim3  threads( num_threads, 1, 1);

	unsigned int timer = 0;
	cutilCheckError( cutCreateTimer( &timer));
	cutilCheckError( cutStartTimer( timer));

	//////////////////////////////////////////////////////////////////
	//Architectures with compute capability 1.x, function
	//cuPrintf() is used. Otherwise, function printf() is called.
	bool use_cuPrintf = (deviceProps.major < 2);

	if (use_cuPrintf)
	{
		//Initializaton, allocate buffers on both host
		//and device for data to be printed.
		cudaPrintfInit();
	}
	/////////////////////////////////////////////////////////////////

	//printf("launching GPU kernel...\n");

	// execute the kernel
	d_Round<<< grid, threads >>>( d_idata, d_odata, d_ext_key, Nr, Nb);
	cutilDeviceSynchronize(); // ???

	// check if kernel execution generated and error
	cutilCheckMsg("Kernel execution failed");


	/////////////////////////////////////////////////////////////
	if (use_cuPrintf)
	{
		//Dump current contents of output buffer to standard 
		//output, and origin (block id and thread id) of each line 
		//of output is enabled(true).
		cudaPrintfDisplay(stdout, true);

		//Free allocated buffers by cudaPrintfInit().
		cudaPrintfEnd();
	}
	//////////////////////////////////////////////////////////////



	cutilSafeCall( cudaMemcpy( state, d_odata, state_size,	cudaMemcpyDeviceToHost) );
	cutilCheckError( cutStopTimer( timer));
	//printf( "GPU Processing time: %f (ms)\n", cutGetTimerValue( timer));
	cutilCheckError( cutDeleteTimer( timer));

	memcpy(dataout,state,state_size);

	cutilSafeCall(cudaFree(d_idata));
	cutilSafeCall(cudaFree(d_ext_key));
	cutilSafeCall(cudaFree(d_odata));

	cutilDeviceReset();
	*/


} // Encrypt

// call this to encrypt any size block
void Rijndael_GPU::Encrypt(const unsigned char * datain, unsigned char * dataout, unsigned long numBlocks, BlockMode mode)
{
	if (0 == numBlocks)
		return;
	unsigned int blocksize = Nb*4;
	switch (mode)
	{
	case ECB :
		while (numBlocks)
		{
			EncryptBlock(datain,dataout);
			datain   += blocksize;
			dataout  += blocksize;
			--numBlocks;
		}
		break;
	case CBC :
		{
			unsigned char buffer[64];
			memset(buffer,0,sizeof(buffer)); // clear out - todo - allow setting the Initialization Vector - needed for security
			while (numBlocks)
			{
				for (unsigned int pos = 0; pos < blocksize; ++pos)
					buffer[pos] ^= *datain++;
				EncryptBlock(buffer,dataout);
				memcpy(buffer,dataout,blocksize);
				dataout  += blocksize;
				--numBlocks;
			}
		}
		break;
	default :
		assert(!"Unknown mode!");
		break;
	}
} // Encrypt

void Rijndael_GPU::StartDecryption(const unsigned char * key)
{
	KeyExpansion(key);
} // StartDecryption

void Rijndael_GPU::DecryptBlock(const unsigned char * datain1, unsigned char * dataout1, const unsigned char * states)
{

	/*
	//TODO: move these to initGPU
	int devID;
	cudaDeviceProp deviceProps;
	devID = cutGetMaxGflopsDeviceId();
	cudaSetDevice( devID );

	// get number of SMs on this GPU
	cutilSafeCall(cudaGetDeviceProperties(&deviceProps, devID));
	//printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name, deviceProps.multiProcessorCount);

	unsigned int num_threads = 16;	

	// allocate device memory
	size_t memsize = state_size;

	unsigned char* d_idata;
	cutilSafeCall( cudaMalloc( (void**) &d_idata, memsize));
	// copy host memory to device
	cutilSafeCall( cudaMemcpy( d_idata, datain1, memsize, cudaMemcpyHostToDevice));

	unsigned char* d_ext_key;
	int key_size = Nb*(Nr+1)*4;
	//cutilSafeCall( cudaMalloc( (void**) &d_ext_key, key_size));
	// copy host memory to device
	//cutilSafeCall( cudaMemcpy( d_ext_key, W, key_size, cudaMemcpyHostToDevice));
	cutilSafeCall( cudaMemcpyToSymbol( "d_key_const", W, key_size));

	// allocate device memory for result
	unsigned char* d_odata;
	cutilSafeCall( cudaMalloc( (void**) &d_odata, memsize));

	// setup execution parameters
	dim3  grid( 1, 1, 1);
	dim3  threads( num_threads, 1, 1);

	unsigned int timer = 0;
	cutilCheckError( cutCreateTimer( &timer));
	cutilCheckError( cutStartTimer( timer));

	//////////////////////////////////////////////////////////////////
	//Architectures with compute capability 1.x, function
	//cuPrintf() is used. Otherwise, function printf() is called.
	bool use_cuPrintf = (deviceProps.major < 2);

	if (use_cuPrintf)
	{
		//Initializaton, allocate buffers on both host
		//and device for data to be printed.
		cudaPrintfInit();
	}
	/////////////////////////////////////////////////////////////////

	//printf("launching GPU kernel...\n");

	// execute the kernel
	d_inv_Round<<< grid, threads >>>( d_idata, d_odata, Nr, Nb);
	cutilDeviceSynchronize(); // ???

	// check if kernel execution generated and error
	cutilCheckMsg("Kernel execution failed");


	/////////////////////////////////////////////////////////////
	if (use_cuPrintf)
	{
		//Dump current contents of output buffer to standard 
		//output, and origin (block id and thread id) of each line 
		//of output is enabled(true).
		cudaPrintfDisplay(stdout, true);

		//Free allocated buffers by cudaPrintfInit().
		cudaPrintfEnd();
	}
	//////////////////////////////////////////////////////////////



	cutilSafeCall( cudaMemcpy( state, d_odata, state_size,	cudaMemcpyDeviceToHost) );
	cutilCheckError( cutStopTimer( timer));
	//printf( "GPU Processing time: %f (ms)\n", cutGetTimerValue( timer));
	cutilCheckError( cutDeleteTimer( timer));

	memcpy(dataout,state,state_size);

	cutilSafeCall(cudaFree(d_idata));
	//cutilSafeCall(cudaFree(d_ext_key));
	cutilSafeCall(cudaFree(d_odata));

	//cutilDeviceReset();
	*/

	const unsigned long * datain = reinterpret_cast<const unsigned long*>(datain1);
	unsigned long * dataout = reinterpret_cast<unsigned long*>(dataout1);

	memcpy(state,datain,state_size);
	InvFinalRound(Nr);
	for (int i = Nr-1; i > 0; i--)
	{
		if (0 != states)
		{ // compare
			if (memcmp(state,states+(i-1)*state_size,state_size) != 0)
			{
				cerr << "State " << i << " failed:\n";
				cerr << "State     : "; DumpHex(state,state_size);
				cerr << "Should be : "; DumpHex(states+(i-1)*state_size,state_size);
			}
		}
		InvRound(i);
	}
	AddRoundKey(0);
	memcpy(dataout,state,state_size);

} // Decrypt

// call this to decrypt any size block
void Rijndael_GPU::Decrypt(const unsigned char * datain, unsigned char * dataout, unsigned long numBlocks, BlockMode mode)
{	
	if (0 == numBlocks)
		return;

	unsigned int blocksize = Nb*4;

	////////////////////////////////////////////////////////////////////
	unsigned int threads_per_block = 256;
	size_t num_threads = 256;

	//TODO: move these to initGPU
	int devID;
	cudaDeviceProp deviceProps;
	devID = cutGetMaxGflopsDeviceId();
	cudaSetDevice( devID );

	// get number of SMs on this GPU
	cutilSafeCall(cudaGetDeviceProperties(&deviceProps, devID));
	//printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name, deviceProps.multiProcessorCount);
	
	// extract device current free memory in Bytes
	unsigned int totalMem;
	unsigned int freeMem;
	cutilSafeCall(cudaMemGetInfo(&freeMem,&totalMem));
	//printf("total free mem: %d\n",freeMem);

	unsigned int totalDataSize =  numBlocks*blocksize;
	unsigned int numOfRounds = ceil((double)totalDataSize / freeMem);
	unsigned int roundMem = (totalDataSize > freeMem ? freeMem : totalDataSize);
	// Data left to digest
	unsigned int DataLeft = totalDataSize;

	// copy expanded key to const memory of device
	unsigned char* d_ext_key;
	int key_size = (Nr+1)*blocksize;			
	cutilSafeCall( cudaMemcpyToSymbol( "d_key_const", W, key_size));
	////////////////////////////////////////////////////////////////////

	switch (mode)
	{
	case ECB :
		
		//printf("ECB mode:\n");

		for(unsigned int currentRound = 0; currentRound<numOfRounds; currentRound++)
		{
			//printf("totalDataSize: %d, numOfRounds: %d, roundmem: %d , currentRound: %d, DataLeft: %d\n",totalDataSize,numOfRounds,roundMem,currentRound,DataLeft);
			const unsigned int grid_dim_limit = deviceProps.maxGridSize[0];
			const unsigned int grid_theoretical_size = ceil((double)roundMem/threads_per_block);
			const unsigned int grid_x = ( grid_theoretical_size<grid_dim_limit ? grid_theoretical_size : grid_dim_limit );
			const unsigned int grid_y = ( ceil((double)grid_theoretical_size/grid_dim_limit) );

			// allocate device memory
			//size_t memsize = roundMem;
			size_t memsize = grid_y*grid_x*256;

			unsigned char* d_idata;
			cutilSafeCall( cudaMalloc( (void**) &d_idata, memsize));
			// copy host memory to device
			cutilSafeCall( cudaMemcpy( d_idata, datain, roundMem, cudaMemcpyHostToDevice));

			// allocate device memory for result
			unsigned char* d_odata;
			cutilSafeCall( cudaMalloc( (void**) &d_odata, memsize));

			
			// setup execution parameters
			dim3  grid( grid_x, grid_y, 1);
			dim3  threads( 16, 16, 1); // 16*16 = 256

			unsigned int timer = 0;
			cutilCheckError( cutCreateTimer( &timer));
			cutilCheckError( cutStartTimer( timer));

			//////////////////////////////////////////////////////////////////
			//Architectures with compute capability 1.x, function
			//cuPrintf() is used. Otherwise, function printf() is called.
			bool use_cuPrintf = (deviceProps.major < 2);

			if (use_cuPrintf)
			{
				//Initializaton, allocate buffers on both host
				//and device for data to be printed.
				cudaPrintfInit();
			}
			/////////////////////////////////////////////////////////////////

			printf("launching GPU kernel...\n");

			// execute the kernel
			d_inv_Round_multiBlock<<< grid, threads >>>( d_idata, d_odata, Nr, Nb);
			cutilDeviceSynchronize(); // ???

			// check if kernel execution generated and error
			cutilCheckMsg("Kernel execution failed");


			/////////////////////////////////////////////////////////////
			if (use_cuPrintf)
			{
				//Dump current contents of output buffer to standard 
				//output, and origin (block id and thread id) of each line 
				//of output is enabled(true).
				cudaPrintfDisplay(stdout, true);

				//Free allocated buffers by cudaPrintfInit().
				cudaPrintfEnd();
			}
			//////////////////////////////////////////////////////////////



			cutilSafeCall( cudaMemcpy( dataout, d_odata, roundMem,	cudaMemcpyDeviceToHost) );
			cutilCheckError( cutStopTimer( timer));
			printf( "GPU Processing time: %f (ms)\n", cutGetTimerValue( timer));
			cutilCheckError( cutDeleteTimer( timer));

			cutilSafeCall(cudaFree(d_idata));
			cutilSafeCall(cudaFree(d_odata));
			
			DataLeft -= roundMem;			
			datain   += roundMem;
			dataout  += roundMem;
			roundMem = ( DataLeft < roundMem ? DataLeft : roundMem );
		}
		break;

	case CBC :
		{
			//printf("CBC mode\n");
			unsigned char buffer[64];
			memset(buffer,0,sizeof(buffer)); // clear out - todo - allow setting the Initialization Vector - needed for security
			DecryptBlock(datain,dataout); // do first block
			for (unsigned int pos = 0; pos < blocksize; ++pos)
				*dataout++ ^= buffer[pos];
			datain += blocksize;
			numBlocks--;

			while (numBlocks)
			{
				DecryptBlock(datain,dataout); // do first block
				for (unsigned int pos = 0; pos < blocksize; ++pos)
					*dataout++ ^= *(datain-blocksize+pos);
				datain  += blocksize;
				--numBlocks;
			}
		}
		break;

	default :
		assert(!"Unknown mode!");
	}

	cutilDeviceReset();

} // Decrypt

// the constructor - makes sure local things are initialized
Rijndael_GPU::Rijndael_GPU(void)
{
	if (false == tablesInitialized)
		tablesInitialized = CreateRijndaelTables(true,false);
	this->initGPUConst();
	if (false == tablesInitialized)
		throw "Tables failed to initialize";
}

Rijndael_GPU::~Rijndael_GPU(void)
{
	cutilDeviceReset();
}

void Rijndael_GPU::initGPUConst(void){
	cutilSafeCall( cudaMemcpyToSymbol( "d_byte_sub_const", &byte_sub, 256) );
	cutilSafeCall( cudaMemcpyToSymbol( "d_gf2_8_inv_const", &gf2_8_inv, 256) );
	cutilSafeCall( cudaMemcpyToSymbol( "d_inv_byte_sub_const", &inv_byte_sub, 256) );
	cutilSafeCall( cudaMemcpyToSymbol( "d_Rcon_const", &Rcon, 60*sizeof(unsigned long)));
	cutilSafeCall( cudaMemcpyToSymbol( "d_shift_row_map_const", &shift_row_map, 16));
	cutilSafeCall( cudaMemcpyToSymbol( "d_inv_shift_row_map_const", &inv_shift_row_map, 16));

	cutilSafeCall( cudaMemcpyToSymbol( "d_mult_by_2_const", &mult_by_2, 256));
	cutilSafeCall( cudaMemcpyToSymbol( "d_mult_by_3_const", &mult_by_3, 256));
	cutilSafeCall( cudaMemcpyToSymbol( "d_mult_by_9_const", &mult_by_9, 256));
	cutilSafeCall( cudaMemcpyToSymbol( "d_mult_by_11_const", &mult_by_11, 256));
	cutilSafeCall( cudaMemcpyToSymbol( "d_mult_by_13_const", &mult_by_13, 256));
	cutilSafeCall( cudaMemcpyToSymbol( "d_mult_by_14_const", &mult_by_14, 256));
	
	//	runKernelTest();
}

// end - Rijndael_GPU.cu

#endif // RIJNDAEL_GPU_CU